In [1]:
import pandas as pd
import numpy as np
from pathlib import Path
import matplotlib
from IPython.display import HTML
import requests
from datetime import datetime

Import stations CSV

My first step is to find interesting stations. One of the requirements is good time-range coverage, so I will evaluate the downloaded files.

Reading the station coordinates CSV


In [2]:
stations = pd.read_csv(
    '../../data/ncdc/ish-history.csv',
    skiprows=1, 
    names=['usaf','wban','stname','ctry','fips','state','call','lat' ,'lon' ,'elev'],
    dtype=object)
print (len(stations))
stations[:3]


30567
Out[2]:
usaf wban stname ctry fips state call lat lon elev
0 000000 99999 NYGGBUKTA GREENLAND- STA GL GL NaN NaN +73483 +021567 +00030
1 000010 99999 JAN HAYEN NO NO NaN NaN +70983 -007700 +00229
2 000020 99999 ISFJORD RADIO SPITZBERGEN NO NO NaN NaN +78067 +013633 +00079

Get correct coordinates by dividing lat/lon by 1000 and the elevation by 10. Then generate a unique index that combines the USAF and WBAN identifiers. Finally, drop any station that doesn't have a usable location.


In [3]:
stations['lat'] = stations.lat.apply(lambda lat: float(lat)/1000)
stations['lon'] = stations.lon.apply(lambda lon: float(lon)/1000)
stations['elev'] = stations.elev.apply(lambda e: float(e)/10.0)

In [ ]:
stations['id']     = stations.apply(lambda row : "{}-{}".format(row.usaf,row.wban),axis=1)
stations.set_index(['id'],inplace=True)
stations.dropna(how='any',subset=['lat','lon'],inplace=True)
stations.head()
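
The row-wise apply works, but for tens of thousands of rows the pandas string accessor is faster; a roughly equivalent vectorized sketch (assuming usaf and wban come through as strings, which dtype=object ensures here):

# Vectorized alternative to the row-wise format() above
stations['id'] = stations['usaf'].str.cat(stations['wban'], sep='-')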

In [ ]:
stations.tail()

Read CSV of observations

Create a Path object pointing to where the NCDC data has been downloaded


In [6]:
p = Path('../../data/ncdc')

Check whether the stations CSV already exists, or generate it from the raw files. To generate the CSV, each file name is parsed, and the file is also fully read in order to count the number of observations per station.


In [7]:
gsodCSV = p.joinpath('gsod.csv')
if not gsodCSV.exists():
    ops = p.joinpath('raw').joinpath('gsod').glob('**/*.op')
    agsod = []

    for op in ops:
        try:
            data = op.name.replace('.op','').split('-')
            data.append(sum(1 for line in op.open( encoding='utf8' ))-1)
            agsod.append(data)
        except UnicodeDecodeError:
            print(op.absolute())
    dfGsod = pd.DataFrame(data = agsod, columns=['usaf','wban','year','obs'])
    dfGsod['id'] = dfGsod.apply(lambda row: "{}-{}".format(row.usaf,row.wban) ,axis=1)
    dfGsod = dfGsod.set_index(['id'])
    dfGsod[['year','obs']].to_csv(str(gsodCSV))

In [8]:
print ('Reading existing stations per year CSV')
dfGsod = pd.read_csv(str(gsodCSV),index_col=['id'])
print ("{:,} station files".format(len(dfGsod)))
print ("{:,} observations".format(dfGsod.obs.sum()))
dfGsod.head()


Reading existing stations per year CSV
404,762 station files
111,648,951 observations
Out[8]:
year obs
id
033110-99999 1931 357
105130-99999 1931 196
103610-99999 1931 1
030050-99999 1931 365
122050-99999 1931 359

Year statistics per station

Now study this dataframe, grouping by id to see the total number of years recorded per station, along with the latest and earliest year


In [9]:
year_groups = dfGsod[['year']].groupby(level=0)
year_count  = year_groups.count()
year_max    = year_groups.max()
year_min    = year_groups.min()

years = pd.concat([year_count,year_max,year_min],axis=1)
years.columns = ['count','max','min']
years.head()


Out[9]:
count max min
id
008209-99999 1 2009 2009
008210-99999 1 2009 2009
010010-99999 41 2009 1955
010013-99999 3 1988 1986
010014-99999 24 2009 1986
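
The same three statistics can also be computed in a single groupby call with agg; a roughly equivalent sketch:

# count of years plus last/first year per station, in one pass
years = dfGsod.groupby(level=0)['year'].agg(['count', 'max', 'min'])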

Count observations per station


In [10]:
obs_groups = dfGsod[['obs']].groupby(level=0)
obs_count = obs_groups.sum()
obs_count.head()


Out[10]:
obs
id
008209-99999 27
008210-99999 32
010010-99999 14180
010013-99999 335
010014-99999 6300

Join stations and observations statistics

Now we can check that the indexes of the three data frames are unique and then join them to retrieve only the stations with observations


In [11]:
stations.index.is_unique and years.index.is_unique and obs_count.index.is_unique


Out[11]:
True

In [12]:
scdf = pd.concat([stations,years,obs_count],axis=1,join='inner')
scdf.head()


Out[12]:
usaf wban stname ctry fips state call lat lon elev count max min obs
id
010010-99999 010010 99999 JAN MAYEN NO NO NaN ENJA 70.933 -8.667 9 41 2009 1955 14180
010014-99999 010014 99999 SOERSTOKKEN NO NO NaN ENSO 59.783 5.350 49 24 2009 1986 6300
010015-99999 010015 99999 BRINGELAND NO NO NaN ENBL 61.383 5.867 327 23 2009 1987 8030
010016-99999 010016 99999 RORVIK/RYUM NO NO NaN NaN 64.850 11.233 14 5 1991 1987 1555
010017-99999 010017 99999 FRIGG NO NO NaN ENFR 59.933 2.417 48 18 2005 1988 4086

Finally, we can export this dataset as a CSV to study it and filter the appropriate stations for this first iteration


In [13]:
scdf.to_csv('stations.csv')

Get preferred stations

The next step is done at CartoDB, an analysis and mapping service. From the exported stations.csv, I've created a map of the two stations with the most years of observations for each Köppen-Geiger class. The map is interactive: you can zoom in and click on stations to check their attributes.


In [14]:
HTML('<iframe width="100%" height="520" frameborder="0" src="https://team.cartodb.com/u/jsanz/viz/c9d103fe-6ab1-11e5-b45b-0e674067d321/embed_map" allowfullscreen webkitallowfullscreen mozallowfullscreen oallowfullscreen msallowfullscreen></iframe>')


Out[14]:

To get those stations back into this notebook I will use the CartoDB SQL API, executing the following SQL with CSV output so the result can be read directly into a pandas DataFrame.

WITH ranked AS (
  SELECT 
  s.id, r.gridcode,
  rank() over (partition by r.gridcode order by s.count desc, s.elev asc) pos
  FROM stations s 
  JOIN climate_regions r ON ST_Intersects(s.the_geom,r.the_geom)
), filtered as (
  SELECT 
  r.id,k.koppen,
  rank() over (partition by r.id order by k.koppen) pos
  FROM ranked r
  JOIN koppen k ON r.gridcode = k.gridcode
  WHERE pos < 3
) 
SELECT id,koppen FROM filtered WHERE POS = 1
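
The hand-encoded URL in the next cell was produced from that query; as an alternative, the request could be built programmatically with requests (already imported above) so no manual encoding is needed. A minimal sketch, assuming the same SQL API endpoint:

import io

# The query above as a plain string; requests URL-encodes it for us
sql = """
WITH ranked AS (
  SELECT
  s.id, r.gridcode,
  rank() over (partition by r.gridcode order by s.count desc, s.elev asc) pos
  FROM stations s
  JOIN climate_regions r ON ST_Intersects(s.the_geom, r.the_geom)
), filtered AS (
  SELECT
  r.id, k.koppen,
  rank() over (partition by r.id order by k.koppen) pos
  FROM ranked r
  JOIN koppen k ON r.gridcode = k.gridcode
  WHERE pos < 3
)
SELECT id, koppen FROM filtered WHERE pos = 1
"""

# Ask for CSV output and read the response straight into pandas
resp = requests.get('https://jsanz.cartodb.com/api/v1/sql',
                    params={'format': 'csv', 'q': sql})
selections = pd.read_csv(io.StringIO(resp.text), index_col=['id'])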

In [15]:
query = 'https://jsanz.cartodb.com/api/v1/sql?format=csv&q=WITH+ranked+AS+(%0A++SELECT+%0A++s.id,+r.gridcode,%0A++rank()+over+(partition+by+r.gridcode+order+by+s.count+desc,+s.elev+asc)+pos%0A++FROM+stations+s+%0A++JOIN+climate_regions+r+ON+ST_Intersects(s.the_geom,r.the_geom)%0A),+filtered+as+(%0A++SELECT+%0A++r.id,k.koppen%0A++,rank()+over+(partition+by+r.id+order+by+k.koppen)+pos%0A++FROM+ranked+r%0A++JOIN+koppen+k+ON+r.gridcode+%3D+k.gridcode%0A++WHERE+pos+%3C+3+%0A)+%0ASELECT+id,koppen%0AFROM+filtered+WHERE+POS+%3D+1'
selections = pd.read_csv(query,index_col=['id'])

Check that the index of the imported dataframe is unique and then join it with our stations dataset. Using the parameter join='inner', this join keeps only the rows present in both data frames.


In [16]:
selections.index.is_unique


Out[16]:
True

In [17]:
scdfc = pd.concat([scdf,selections],axis=1,join='inner')

Getting the observations for the selected stations

Filter the observations data set (around 111M records) by the selected stations and generate the list of files to read from the observations folder. First we join the station-year file index with the selected stations, then we copy the index into a column, and finally we build each path by joining the year folder with a file name made of the station id and the year.


In [18]:
files_to_read = pd.merge(left=dfGsod,right=scdfc,left_index=True,right_index=True,)[['year']]
files_to_read['id'] = files_to_read.index
files_to_read['path'] = files_to_read.apply(lambda row: Path('../../data/ncdc/raw/gsod').joinpath("{0}/{1}-{0}.op".format(row.year,row.id)),axis=1)
print ("{} files to read".format(len(files_to_read)))


3634 files to read

Read the files defined previously and store the results in a new, large data frame and CSV, adding the Köppen classification. First we define a couple of helper functions, then we iterate over the files_to_read data frame, reading each fixed-width .op file and appending it to the final CSV.


In [19]:
def getId(stn,wban):
    try:
        istn = int(stn)
        iwban = int(wban)
        return "{:0>6}-{:0>5}".format(istn,iwban)
    except ValueError:
        print("{}/{}".format(stn,wban))
        
def getStationByStnWban(stn,wban):
    try:
        koppen = scdfc.loc[getId(stn,wban)].koppen
    except KeyError:
        koppen = None
    return koppen
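
As a quick sanity check of the helpers (using the station that shows up in the sample output further down):

getId('40180', '16201')                  # -> '040180-16201'
getStationByStnWban('40180', '16201')    # -> 'Cfc' here; None for stations not selected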

In [20]:
observationsCSV = p.joinpath('observations.csv')
if not observationsCSV.exists():
    i = 0
    acc = 0
    for index,row in files_to_read.iterrows():
        path = row['path']
        if path.exists():
            dfObsTemp = pd.read_fwf(
                    str(path),
                    colspecs=[(0,7),(7,13),(14,18),(18,22),(25,30),(31,33),(35,41),
                              (42,44),(46,52),(53,55),(57,63),(64,66),(68,73),(74,76),
                              (78,84),(84,86),(88,93),(95,100),(102,108),(108,109),
                              (110,116),(116,117),(118,123),(123,124),(125,130),(132,138)],
                    skiprows=1,
                    names=['stn','wban','year','monthday','temp','temp_count',
                           'dewp','dewp_count','slp','slp_count','stp','stp_count',
                           'visib','visib_count','wsdp','wsdp_count','mxspd',
                           'gust','max','max_flag','min','min_flag','prcp','prc_flag','sndp','frshtt']
                    )
            dfObsTemp['koppen'] = dfObsTemp.apply(lambda row: getStationByStnWban(row.stn,row.wban),axis=1)
            dfObsTemp.to_csv(str(observationsCSV),mode='a')

            i += 1
            acc += len (dfObsTemp)

            if i % 1000 == 0:
                print("{:>8} obs".format(acc))


  315976 obs
  630799 obs
  968563 obs

In [21]:
print ('Reading observations CSV')
dfObs = pd.read_csv(str(observationsCSV),index_col=0)
dfObs = dfObs[(dfObs.stn != 'stn')]
print ("{:,} observations".format(len(dfObs)))
dfObs.head()


Reading observations CSV
1,190,166 observations
Out[21]:
stn wban year monthday temp temp_count dewp dewp_count slp slp_count ... gust max max_flag min min_flag prcp prc_flag sndp frshtt koppen
0 40180 16201 1991 101 30.4 24 22.6 24 977.1 16 ... 999.9 34.5 * 20.1 NaN 0.0 F 999.9 11000 Cfc
1 40180 16201 1991 102 36.8 23 28.6 23 961.9 8 ... 39.0 38.1 NaN 30.9 NaN 0.0 F 999.9 11000 Cfc
2 40180 16201 1991 103 35.9 24 27.6 24 965.0 7 ... 49.9 37.4 NaN 33.8 * 0.0 E 999.9 0 Cfc
3 40180 16201 1991 104 34.2 21 27.7 21 979.8 8 ... 36.9 36.1 NaN 33.1 NaN 0.0 C 999.9 0 Cfc
4 40180 16201 1991 105 32.5 24 25.6 24 983.6 8 ... 32.1 34.3 NaN 30.2 * 0.0 C 999.9 0 Cfc

5 rows × 27 columns

Performing data management operations on the dataset

Now that we have the raw dataset, we can start doing management operations. First, generate a copy of the dataframe with only the columns we are interested in.


In [22]:
dfObs2 = dfObs.copy()[['stn','wban','year','monthday','temp','max','min','frshtt','koppen']]

Management

Generate an index using the station id and the date


In [23]:
def getDateTimeFromRow(row):
    try:
        iyear = int(row.year)
        imonth = int("{:0>4}".format(row.monthday)[0:2])
        iday = int("{:0>4}".format(row.monthday)[2:4])
        return  datetime(iyear,imonth,iday)
    except ValueError:
        return np.nan

dfObs2['id']   = dfObs2.apply(lambda row: getId(row.stn,row.wban) ,axis=1)
dfObs2['date'] = dfObs2.apply(lambda row : getDateTimeFromRow(row),axis=1)
dfObs2.set_index(['id','date'],inplace=True)
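
Row-wise apply is slow on more than a million rows; a vectorized sketch of the same date construction (assuming year and monthday come through as strings or integers, so the zero padding behaves the same way) would be:

# Build YYYYMMDD strings and parse them in one go; unparseable rows become NaT
dates = pd.to_datetime(
    dfObs2['year'].astype(str) + dfObs2['monthday'].astype(str).str.zfill(4),
    format='%Y%m%d', errors='coerce')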

The frshtt column needs to be padded with zeros to get all the flags in the correct place. Then it is possible to extract the occurrence of the different weather conditions (fog, rain, snow, hail, thunder, tornado).


In [24]:
dfObs2['frshtt']  = dfObs2.apply(lambda row: "{:0>6}".format(row.frshtt),axis=1)
dfObs2['fog']     = dfObs2['frshtt'].apply(lambda row: row[0:1]=='1')
dfObs2['rain']    = dfObs2['frshtt'].apply(lambda row: row[1:2]=='1')
dfObs2['snow']    = dfObs2['frshtt'].apply(lambda row: row[2:3]=='1')
dfObs2['hail']    = dfObs2['frshtt'].apply(lambda row: row[3:4]=='1')
dfObs2['thunder'] = dfObs2['frshtt'].apply(lambda row: row[4:5]=='1')
dfObs2['tornado'] = dfObs2['frshtt'].apply(lambda row: row[5:6]=='1')
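
The same flags can also be pulled out with the pandas string accessor instead of a per-row apply; for instance:

# Pad to six characters and index each Fog-Rain-Snow-Hail-Thunder-Tornado flag
flags = dfObs2['frshtt'].astype(str).str.zfill(6)
dfObs2['fog']     = flags.str[0] == '1'
dfObs2['tornado'] = flags.str[5] == '1'   # and so on for the remaining flags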

Recode the temperature columns, replacing the missing-value code with NaN and then converting them to numeric


In [25]:
dfObs2['tempC'] = dfObs2['temp'].replace('99.9', np.nan)
dfObs2['maxC']  = dfObs2['max'].replace('99.9', np.nan)
dfObs2['minC']  = dfObs2['min'].replace('99.9', np.nan)

dfObs2['tempC'] = dfObs2['tempC'].convert_objects(convert_numeric=True) / 10
dfObs2['maxC']  = dfObs2['maxC'].convert_objects(convert_numeric=True) / 10
dfObs2['minC']  = dfObs2['minC'].convert_objects(convert_numeric=True) / 10
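
convert_objects has since been deprecated and removed from pandas; on recent versions the equivalent conversion would use pd.to_numeric:

# Coerce anything non-numeric to NaN, then scale, as above
dfObs2['tempC'] = pd.to_numeric(dfObs2['tempC'], errors='coerce') / 10
dfObs2['maxC']  = pd.to_numeric(dfObs2['maxC'],  errors='coerce') / 10
dfObs2['minC']  = pd.to_numeric(dfObs2['minC'],  errors='coerce') / 10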

Frequency tables

Frequency tables for the categorical columns: koppen, thunder, and tornado


In [26]:
dfObs.koppen.value_counts(normalize=True)*100


Out[26]:
Cfb    4.206472
Cfa    4.092706
BSh    4.058425
Dfc    4.057249
Csa    4.037672
Aw     3.995913
ET     3.954154
Dsc    3.933317
BWh    3.926259
Csb    3.904413
Af     3.797369
Dfa    3.756619
Cfc    3.722338
BSk    3.693686
Dwb    3.629746
Dwa    3.600506
Dwc    3.593196
BWk    3.561688
Dfb    3.463550
Cwa    3.191236
Dfd    3.155610
Dwd    3.049323
Dsa    2.991263
Dsb    2.736005
Cwb    2.696851
EF     2.018542
Am     1.730095
Csc    1.673716
As     1.562219
Cwc    1.551464
Dsd    0.658396
dtype: float64

In [27]:
dfObs2.tornado.value_counts(normalize=True)*100


Out[27]:
False    99.988069
True      0.011931
dtype: float64

In [28]:
dfObs2.thunder.value_counts(normalize=True)*100


Out[28]:
False    94.976499
True      5.023501
dtype: float64

Categorize the temperatures by quartiles and then build the frequency table to confirm the categorization


In [29]:
dfObs2['temp4']=pd.qcut(dfObs2.tempC, 4, labels=["1=0%tile","2=25%tile","3=50%tile","4=75%tile"])
dfObs2['temp4'].value_counts(normalize=True)*100


Out[29]:
1=0%tile     25.066503
2=25%tile    25.058521
3=50%tile    24.940639
4=75%tile    24.934169
dtype: float64

To get the cut points, we can group by the categorization column and take the maximum value of each group


In [38]:
group_by_temp4 = dfObs2[['temp4','tempC']].groupby(['temp4'])
group_by_temp4.max()


Out[38]:
tempC
temp4
1=0%tile 3.57
2=25%tile 5.42
3=50%tile 7.15
4=75%tile 11.46
